import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
# NOTE(review): `scalar` (sic — presumably meant `scaler`) is instantiated but
# never used anywhere below; features are clustered unscaled. Confirm whether
# scaling was intended before the KMeans fits.
scalar=StandardScaler()
import warnings
warnings.filterwarnings('ignore')
# Load the Mall Customers dataset (200 rows x 5 columns per the info() output).
customer=pd.read_csv('Mall_Customers.csv')
# .size is rows * columns (200 * 5 = 1000), not the row count.
customer.size
1000
# Peek at the first five rows.
customer.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Schema check: 4 int64 columns plus Gender (object); no missing values.
customer.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# Summary statistics for the numeric columns.
customer.describe()
| CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
| std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
| min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
| 25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
| 50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
| 75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
| max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
# One KDE (density) plot per numeric column. The 9x2 grid is larger than the
# four numeric columns need, so unused cells simply stay blank; subplot slots
# are indexed by the column's position, matching the original layout.
plt.figure(figsize=(30, 45))
for position, column in enumerate(customer.columns):
    if customer[column].dtype != 'object':
        axis = plt.subplot(9, 2, position + 1)
        sns.kdeplot(customer[column], ax=axis)
        plt.xlabel(column)
plt.show()
# Correlation heatmap of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr()
# raises on non-numeric columns such as Gender (object dtype).
plt.figure(figsize=(8,6))
sns.heatmap(customer.corr(numeric_only=True), annot=True)
plt.show()
# Rank features by linear correlation with the spending score.
# numeric_only=True keeps pandas >= 2.0 from raising on the Gender column;
# older pandas silently dropped it, so results are unchanged.
corr_matrix=customer.corr(numeric_only=True)
corr_matrix['Spending Score (1-100)'].sort_values(ascending=False)
Spending Score (1-100) 1.000000 CustomerID 0.013835 Annual Income (k$) 0.009903 Age -0.327227 Name: Spending Score (1-100), dtype: float64
# Feature matrix for 2-D clustering: columns 3 and 4 are
# 'Annual Income (k$)' and 'Spending Score (1-100)' (see customer.head()).
x = customer.iloc[:, [3, 4]].to_numpy()
print(x)
[[ 15 39] [ 15 81] [ 16 6] [ 16 77] [ 17 40] [ 17 76] [ 18 6] [ 18 94] [ 19 3] [ 19 72] [ 19 14] [ 19 99] [ 20 15] [ 20 77] [ 20 13] [ 20 79] [ 21 35] [ 21 66] [ 23 29] [ 23 98] [ 24 35] [ 24 73] [ 25 5] [ 25 73] [ 28 14] [ 28 82] [ 28 32] [ 28 61] [ 29 31] [ 29 87] [ 30 4] [ 30 73] [ 33 4] [ 33 92] [ 33 14] [ 33 81] [ 34 17] [ 34 73] [ 37 26] [ 37 75] [ 38 35] [ 38 92] [ 39 36] [ 39 61] [ 39 28] [ 39 65] [ 40 55] [ 40 47] [ 40 42] [ 40 42] [ 42 52] [ 42 60] [ 43 54] [ 43 60] [ 43 45] [ 43 41] [ 44 50] [ 44 46] [ 46 51] [ 46 46] [ 46 56] [ 46 55] [ 47 52] [ 47 59] [ 48 51] [ 48 59] [ 48 50] [ 48 48] [ 48 59] [ 48 47] [ 49 55] [ 49 42] [ 50 49] [ 50 56] [ 54 47] [ 54 54] [ 54 53] [ 54 48] [ 54 52] [ 54 42] [ 54 51] [ 54 55] [ 54 41] [ 54 44] [ 54 57] [ 54 46] [ 57 58] [ 57 55] [ 58 60] [ 58 46] [ 59 55] [ 59 41] [ 60 49] [ 60 40] [ 60 42] [ 60 52] [ 60 47] [ 60 50] [ 61 42] [ 61 49] [ 62 41] [ 62 48] [ 62 59] [ 62 55] [ 62 56] [ 62 42] [ 63 50] [ 63 46] [ 63 43] [ 63 48] [ 63 52] [ 63 54] [ 64 42] [ 64 46] [ 65 48] [ 65 50] [ 65 43] [ 65 59] [ 67 43] [ 67 57] [ 67 56] [ 67 40] [ 69 58] [ 69 91] [ 70 29] [ 70 77] [ 71 35] [ 71 95] [ 71 11] [ 71 75] [ 71 9] [ 71 75] [ 72 34] [ 72 71] [ 73 5] [ 73 88] [ 73 7] [ 73 73] [ 74 10] [ 74 72] [ 75 5] [ 75 93] [ 76 40] [ 76 87] [ 77 12] [ 77 97] [ 77 36] [ 77 74] [ 78 22] [ 78 90] [ 78 17] [ 78 88] [ 78 20] [ 78 76] [ 78 16] [ 78 89] [ 78 1] [ 78 78] [ 78 1] [ 78 73] [ 79 35] [ 79 83] [ 81 5] [ 81 93] [ 85 26] [ 85 75] [ 86 20] [ 86 95] [ 87 27] [ 87 63] [ 87 13] [ 87 75] [ 87 10] [ 87 92] [ 88 13] [ 88 86] [ 88 15] [ 88 69] [ 93 14] [ 93 90] [ 97 32] [ 97 86] [ 98 15] [ 98 88] [ 99 39] [ 99 97] [101 24] [101 68] [103 17] [103 85] [103 23] [103 69] [113 8] [113 91] [120 16] [120 79] [126 28] [126 74] [137 18] [137 83]]
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
wcss = []
for k in range(1, 11):
    # n_init is pinned to 10 (the pre-1.4 scikit-learn default) because the
    # default changed to 'auto' in 1.4; pinning keeps results reproducible
    # across library versions.
    km = KMeans(n_clusters=k, init='k-means++', max_iter=400, n_init=10,
                random_state=42)
    km.fit(x)
    wcss.append(km.inertia_)
sns.set()
plt.plot(range(1, 11), wcss, linewidth=3, markersize=8, marker='o', color='red')
plt.title("The elbow point graph")
plt.xlabel('number of cluster')
plt.ylabel('wcss')
plt.show()
# Final 2-D model: k = 5 chosen from the elbow plot above.
# n_init=10 pins the pre-1.4 scikit-learn default for reproducibility.
kmeans=KMeans(n_clusters=5,init='k-means++',n_init=10,random_state=0)
labels=kmeans.fit_predict(x)
# Number of customers per segment. Passing the data via the keyword `x=` is
# required on seaborn >= 0.13, where the bare positional argument was removed.
sns.countplot(x=labels)
<AxesSubplot:ylabel='count'>
plt.figure(figsize=(12,10))
# NOTE(review): the income/spending captions below assume a fixed mapping from
# KMeans label index -> segment, which depends on random_state and data; verify
# against the plotted centroids before trusting the legend text.
segment_styles = [
    ('green', 'High Income-Less Spending'),
    ('blue', 'Mid Income-Mid Spending'),
    ('red', 'High Income-High Spending'),
    ('orange', 'Low Income-High Spending'),
    ('purple', 'Low Income-Low Spending'),
]
for cluster_id, (colour, caption) in enumerate(segment_styles):
    members = x[labels == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=80, c=colour, label=caption)
# Cluster centres in black on top of the member points.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=100, c='black', label='centroid')
plt.title("Customer Group")
plt.xlabel("Annual income(k$)")
plt.ylabel("Spending score")
plt.legend()
plt.show()
# Attach the 2-D cluster label to each customer row (indexes align, so this
# is equivalent to the axis=1 concat it replaces).
cluster_2d = customer.copy()
cluster_2d['Cluster'] = kmeans.labels_
cluster_2d
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 | 4 |
| 1 | 2 | Male | 21 | 15 | 81 | 3 |
| 2 | 3 | Female | 20 | 16 | 6 | 4 |
| 3 | 4 | Female | 23 | 16 | 77 | 3 |
| 4 | 5 | Female | 31 | 17 | 40 | 4 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | Female | 35 | 120 | 79 | 2 |
| 196 | 197 | Female | 45 | 126 | 28 | 0 |
| 197 | 198 | Male | 32 | 126 | 74 | 2 |
| 198 | 199 | Male | 32 | 137 | 18 | 0 |
| 199 | 200 | Male | 30 | 137 | 83 | 2 |
200 rows × 6 columns
# Binary-encode Gender in place (Male -> 1, Female -> 0) so every feature
# column is numeric for the classifier below.
gender_codes = {"Male": 1, "Female": 0}
cluster_2d['Gender'] = cluster_2d['Gender'].map(gender_codes)
cluster_2d
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 4 |
| 1 | 2 | 1 | 21 | 15 | 81 | 3 |
| 2 | 3 | 0 | 20 | 16 | 6 | 4 |
| 3 | 4 | 0 | 23 | 16 | 77 | 3 |
| 4 | 5 | 0 | 31 | 17 | 40 | 4 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | 0 | 35 | 120 | 79 | 2 |
| 196 | 197 | 0 | 45 | 126 | 28 | 0 |
| 197 | 198 | 1 | 32 | 126 | 74 | 2 |
| 198 | 199 | 1 | 32 | 137 | 18 | 0 |
| 199 | 200 | 1 | 30 | 137 | 83 | 2 |
200 rows × 6 columns
# Persist the labelled data, then train a tree to reproduce the 2-D cluster
# assignment from the raw customer features.
cluster_2d.to_csv("Clustered_2d_Data.csv")
x = cluster_2d.drop(['Cluster'],axis=1)
Y= cluster_2d[['Cluster']]
# NOTE(review): CustomerID remains in the feature set; it is an arbitrary ID
# with no real signal and should probably be dropped — confirm downstream.
x_train, x_test, Y_train, Y_test =train_test_split(x, Y, test_size=0.3,random_state=42)
# `clf_2d` replaces the original's confusing use of the name `ax` for a
# classifier (`ax` is later rebound to a matplotlib Axes in this file).
clf_2d = DecisionTreeClassifier()
# Train the Decision Tree classifier; .ravel() flattens the (n, 1) DataFrame
# target to the 1-D array sklearn expects, silencing DataConversionWarning.
clf_2d = clf_2d.fit(x_train, Y_train.values.ravel())
# Predict the response for the held-out 30% test split.
Y_pred = clf_2d.predict(x_test)
print(metrics.confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
[[13 0 0 0 0]
[ 2 24 1 0 0]
[ 0 0 9 0 0]
[ 0 1 0 5 0]
[ 0 0 0 0 5]]
precision recall f1-score support
0 0.87 1.00 0.93 13
1 0.96 0.89 0.92 27
2 0.90 1.00 0.95 9
3 1.00 0.83 0.91 6
4 1.00 1.00 1.00 5
accuracy 0.93 60
macro avg 0.95 0.94 0.94 60
weighted avg 0.94 0.93 0.93 60
# Overall hold-out accuracy for the 2-D-feature tree (~0.93 in this run).
accuracy_score(Y_test, Y_pred)
0.9333333333333333
# 3-D clustering features: Age, Annual Income, Spending Score (columns 2-4).
X=customer.iloc[:,[2,3,4]].values
# Elbow method again, now on the three-feature matrix.
wcss=[]
for i in range(1,11):
    # n_init=10 pins the pre-1.4 scikit-learn default (changed to 'auto' in
    # 1.4) so the elbow curve is reproducible across versions.
    kmeans=KMeans(n_clusters=i,init='k-means++',n_init=10,random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
sns.set()
plt.plot(range(1,11),wcss,linewidth=3,markersize=10,marker='o',color='green')
plt.title("The elbow point graph")
plt.xlabel('number of cluster')
plt.ylabel('wcss')
plt.show()
# Final 3-D model with k = 5 (chosen from the elbow plot above).
model = KMeans(n_clusters = 5, init = "k-means++", max_iter = 400, n_init = 10, random_state = 0)
y_clusters = model.fit_predict(X)
# Segment sizes; the `x=` keyword is required on seaborn >= 0.13, where the
# bare positional data argument was removed.
sns.countplot(x=y_clusters)
<AxesSubplot:ylabel='count'>
# Spot-check: Age, Income and Spending Score of the second member of cluster 0.
print(X[y_clusters == 0,0][1])
print(X[y_clusters == 0,1][1])
print(X[y_clusters == 0,2][1])
20 16 6
# 3-D scatter of the five segments: Age vs Annual Income vs Spending Score.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
palette = ['blue', 'orange', 'green', '#D12B60', 'purple']
for cluster_id, colour in enumerate(palette):
    points = X[y_clusters == cluster_id]
    ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=60, color=colour,
               label=f"cluster {cluster_id}")
ax.set_xlabel('Age of a customer-->')
ax.set_ylabel('Anual Income-->')
ax.set_zlabel('Spending Score-->')
ax.legend()
plt.show()
import plotly.graph_objs as go
from plotly import tools
from plotly.subplots import make_subplots
import plotly.offline as py
# Axis titles follow the data layout of the trace below:
# x = X[:, 0] (Age), y = X[:, 1] (Annual Income), z = X[:, 2] (Spending Score).
# The original had the y and z titles swapped relative to the plotted data.
Scene = dict(xaxis = dict(title = 'Age -->'),yaxis = dict(title = 'Annual Income-->'),zaxis = dict(title = 'Spending Score--->'))
# model.labels_ is the same cluster assignment returned by fit_predict
# (y_clusters); it is used here only to colour the markers.
labels = model.labels_
trace = go.Scatter3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
import joblib
# Persist the fitted 3-D KMeans model for reuse.
# Fixes the misspelled artifact name ("cusomer_segmentation" ->
# "customer_segmentation"); update any loader that referenced the old name.
joblib.dump(model,"customer_segmentation")
['cusomer_segmentation']
# Attach the 3-D cluster label to each customer row (indexes align, so this
# matches the axis=1 concat it replaces).
cluster_df = customer.copy()
cluster_df['Cluster'] = model.labels_
cluster_df
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 | 0 |
| 1 | 2 | Male | 21 | 15 | 81 | 4 |
| 2 | 3 | Female | 20 | 16 | 6 | 0 |
| 3 | 4 | Female | 23 | 16 | 77 | 4 |
| 4 | 5 | Female | 31 | 17 | 40 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | Female | 35 | 120 | 79 | 1 |
| 196 | 197 | Female | 45 | 126 | 28 | 3 |
| 197 | 198 | Male | 32 | 126 | 74 | 1 |
| 198 | 199 | Male | 32 | 137 | 18 | 3 |
| 199 | 200 | Male | 30 | 137 | 83 | 1 |
200 rows × 6 columns
# Binary-encode Gender (Male -> 1, Female -> 0), mirroring the 2-D pipeline,
# so the classifier below sees only numeric features.
gender_mapping = {"Male": 1, "Female": 0}
cluster_df['Gender'] = cluster_df['Gender'].map(gender_mapping)
cluster_df
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 0 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 0 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | 0 | 35 | 120 | 79 | 1 |
| 196 | 197 | 0 | 45 | 126 | 28 | 3 |
| 197 | 198 | 1 | 32 | 126 | 74 | 1 |
| 198 | 199 | 1 | 32 | 137 | 18 | 3 |
| 199 | 200 | 1 | 30 | 137 | 83 | 1 |
200 rows × 6 columns
# Persist the 3-D-labelled data, then build features/target for the classifier.
cluster_df.to_csv("Clustered_Customer_Data.csv")
# NOTE(review): CustomerID stays in the feature set; it is an arbitrary ID and
# should likely be dropped — confirm with downstream consumers.
X = cluster_df.drop(['Cluster'],axis=1)
y= cluster_df[['Cluster']]
# Split on `y` (defined just above) rather than the unrelated `y_clusters`
# array the original passed. The values are identical (y holds model.labels_),
# but this removes the dead `y` binding; .ravel() yields the 1-D label array
# sklearn expects, so downstream results are unchanged.
X_train, X_test, y_train, y_test =train_test_split(X, y.values.ravel(), test_size=0.3,random_state=42)
X_test
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 95 | 96 | 1 | 24 | 60 | 52 |
| 15 | 16 | 1 | 22 | 20 | 79 |
| 30 | 31 | 1 | 60 | 30 | 4 |
| 158 | 159 | 1 | 34 | 78 | 1 |
| 128 | 129 | 1 | 59 | 71 | 11 |
| 115 | 116 | 0 | 19 | 65 | 50 |
| 69 | 70 | 0 | 32 | 48 | 47 |
| 170 | 171 | 1 | 40 | 87 | 13 |
| 174 | 175 | 0 | 52 | 88 | 13 |
| 45 | 46 | 0 | 24 | 39 | 65 |
| 66 | 67 | 0 | 43 | 48 | 50 |
| 182 | 183 | 1 | 46 | 98 | 15 |
| 165 | 166 | 0 | 36 | 85 | 75 |
| 78 | 79 | 0 | 23 | 54 | 52 |
| 186 | 187 | 0 | 54 | 101 | 24 |
| 177 | 178 | 1 | 27 | 88 | 69 |
| 56 | 57 | 0 | 51 | 44 | 50 |
| 152 | 153 | 0 | 44 | 78 | 20 |
| 82 | 83 | 1 | 67 | 54 | 41 |
| 68 | 69 | 1 | 19 | 48 | 59 |
| 124 | 125 | 0 | 23 | 70 | 29 |
| 16 | 17 | 0 | 35 | 21 | 35 |
| 148 | 149 | 0 | 34 | 78 | 22 |
| 93 | 94 | 0 | 40 | 60 | 40 |
| 65 | 66 | 1 | 18 | 48 | 59 |
| 60 | 61 | 1 | 70 | 46 | 56 |
| 84 | 85 | 0 | 21 | 54 | 57 |
| 67 | 68 | 0 | 68 | 48 | 48 |
| 125 | 126 | 0 | 31 | 70 | 77 |
| 132 | 133 | 0 | 25 | 72 | 34 |
| 9 | 10 | 0 | 30 | 19 | 72 |
| 18 | 19 | 1 | 52 | 23 | 29 |
| 55 | 56 | 1 | 47 | 43 | 41 |
| 75 | 76 | 1 | 26 | 54 | 54 |
| 150 | 151 | 1 | 43 | 78 | 17 |
| 104 | 105 | 1 | 49 | 62 | 56 |
| 135 | 136 | 0 | 29 | 73 | 88 |
| 137 | 138 | 1 | 32 | 73 | 73 |
| 164 | 165 | 1 | 50 | 85 | 26 |
| 76 | 77 | 0 | 45 | 54 | 53 |
| 79 | 80 | 0 | 49 | 54 | 42 |
| 197 | 198 | 1 | 32 | 126 | 74 |
| 38 | 39 | 0 | 36 | 37 | 26 |
| 24 | 25 | 0 | 54 | 28 | 14 |
| 122 | 123 | 0 | 40 | 69 | 58 |
| 195 | 196 | 0 | 35 | 120 | 79 |
| 29 | 30 | 0 | 23 | 29 | 87 |
| 19 | 20 | 0 | 35 | 23 | 98 |
| 143 | 144 | 0 | 32 | 76 | 87 |
| 86 | 87 | 0 | 55 | 57 | 58 |
| 114 | 115 | 0 | 18 | 65 | 48 |
| 173 | 174 | 1 | 36 | 87 | 92 |
| 5 | 6 | 0 | 22 | 17 | 76 |
| 126 | 127 | 1 | 43 | 71 | 35 |
| 117 | 118 | 0 | 49 | 65 | 59 |
| 73 | 74 | 0 | 60 | 50 | 56 |
| 140 | 141 | 0 | 57 | 75 | 5 |
| 98 | 99 | 1 | 48 | 61 | 42 |
| 172 | 173 | 1 | 36 | 87 | 10 |
| 96 | 97 | 0 | 47 | 60 | 47 |
# Fit a decision tree that recovers the 3-D cluster labels from the raw
# customer features (construct-and-fit chained in one statement).
clf = DecisionTreeClassifier().fit(X_train, y_train)
# Evaluate on the held-out 30% test split.
y_pred = clf.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[ 5 0 0 0 0]
[ 0 9 0 0 0]
[ 0 1 24 1 0]
[ 0 0 0 14 0]
[ 0 0 0 0 6]]
precision recall f1-score support
0 1.00 1.00 1.00 5
1 0.90 1.00 0.95 9
2 1.00 0.92 0.96 26
3 0.93 1.00 0.97 14
4 1.00 1.00 1.00 6
accuracy 0.97 60
macro avg 0.97 0.98 0.97 60
weighted avg 0.97 0.97 0.97 60
# Overall hold-out accuracy for the 3-D-cluster tree (~0.97 in this run).
accuracy_score(y_test, y_pred)
0.9666666666666667